import pandas as pd #dataframe manipulation
import numpy as np #for numerical process
import seaborn as sns #for visualization
from matplotlib import pyplot as plt #for visualization
from PIL import Image, ImageDraw #for read the image
Image.MAX_IMAGE_PIXELS = None
import skimage.color
import skimage.util
import imagehash #for calculation hash value of image
import cv2 #for read the image
import os
import re
import requests
import itertools
#import distance
import time
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import wandb
import json
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import tqdm
from tqdm.notebook import tqdm_notebook
import os
import itertools
import shutil
import yaml
import subprocess
from huggingface_hub import notebook_login
notebook_login()
from datasets import load_dataset
# Download the ship-detection dataset from the Hugging Face Hub using the
# credentials stored by notebook_login().
# NOTE(review): newer `datasets` releases appear to prefer `token=` over
# `use_auth_token=` -- confirm against the installed version.
data = load_dataset("datadrivenscience/ship-detection",use_auth_token=True)
# Turn the train split into a column -> list dict, then into a DataFrame.
# NOTE(review): the json.dumps / json.loads round trip looks redundant if the
# dict already holds plain Python values -- confirm before removing.
data_list = data['train'].to_dict()
json_data = json.dumps(data_list)
json_obj = json.loads(json_data)
train_data = pd.DataFrame(json_obj)
# Number of distinct values in the `id` column.
train_data['id'].nunique()
train_data.info()
train_data.head()
# For every row, store how many rows share its `id`
# (presumably one row per annotated ship, so this is ships per image -- TODO confirm).
train_data['total_ship']=train_data.groupby('id')['id'].transform('count')
# Distribution and summary statistics of the per-row `total_ship` values.
sns.histplot(x=train_data['total_ship']);
sns.boxplot(x=train_data['total_ship']);
train_data['total_ship'].describe()
def basic_image_info(path):
    """Collect the mode, width and height of every file in a directory.

    Args:
        path: Directory whose entries are each opened as an image with PIL.

    Returns:
        A pandas DataFrame with one row per directory entry (in
        ``os.listdir`` order) and the columns ``id`` (file name),
        ``img_mode``, ``img_width`` and ``img_height``. An empty directory
        yields an empty frame that still has those four columns.
    """
    columns = ['id', 'img_mode', 'img_width', 'img_height']
    records = []
    for file_name in os.listdir(path):
        # The context manager closes the file handle as soon as the
        # attributes are read, so scanning a large directory does not
        # accumulate open files.
        with Image.open(os.path.join(path, file_name)) as img:
            records.append((file_name, img.mode, img.width, img.height))
    return pd.DataFrame(records, columns=columns)
# Build the per-file mode / width / height table for the training image folder.
train_image_basic_info=basic_image_info("/kaggle/working/ship_detection-huggingface/train/")
train_image_basic_info.head()
# How many files use each PIL image mode.
train_image_basic_info['img_mode'].value_counts()
# Density, box plot and summary statistics of image widths.
sns.kdeplot(x=train_image_basic_info['img_width']);
sns.boxplot(y=train_image_basic_info['img_width']);
train_image_basic_info['img_width'].describe()
# Density, box plot and summary statistics of image heights.
sns.kdeplot(x=train_image_basic_info['img_height']);
sns.boxplot(y=train_image_basic_info['img_height']);
train_image_basic_info['img_height'].describe()
def rgb_dist_plot(img, ax):
    """Draw one KDE curve per colour channel of ``img`` on ``ax``.

    ``img.histogram()`` is cut into three consecutive 256-entry slices that
    are labelled Red, Green and Blue in that order, and each slice is passed
    to ``sns.kdeplot``.

    Args:
        img: PIL image. Assumed to be in RGB mode so that the histogram has
            3 x 256 entries -- TODO confirm for non-RGB inputs.
        ax: Matplotlib axes to draw on. ``None`` falls back to the current
            axes.

    Returns:
        None. The curves and the legend are added to ``ax``.
    """
    if ax is None:
        ax = plt.gca()
    # Compute the histogram once instead of once per channel.
    counts = img.histogram()
    # NOTE(review): the KDE is estimated over the per-bin pixel counts, not
    # over the intensity values themselves -- confirm this is intended.
    for channel, color in enumerate(['Red', 'Green', 'Blue']):
        start = channel * 256
        # Draw explicitly on `ax` rather than on whatever axes is current.
        sns.kdeplot(counts[start:start + 256], label=color, color=color, ax=ax)
    ax.legend()
# For the first 10 distinct `total_ship` values, show one example image next
# to the KDE plot of its colour channels.
for annot in train_data['total_ship'].unique()[:10]:
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    # First image id whose rows carry this `total_ship` value.
    img_id = train_data.loc[train_data['total_ship'] == annot, 'id'].iloc[0]
    new_size = 800
    # resize() returns a new, fully loaded image, so the source file can be
    # closed immediately instead of staying open for the rest of the session.
    with Image.open(f"/kaggle/working/ship_detection-huggingface/train/{img_id}") as src_img:
        img_file = src_img.resize((new_size, new_size))
    axs[0].imshow(img_file)
    axs[0].axis('off')
    axs[0].set_title(img_id, fontsize=18)
    _ = rgb_dist_plot(img_file, ax=axs[1])
    axs[1].set_title("RGB Color Distribution For " + img_id, fontsize=18)
# Bounding-box area = width * height, where width is xmax - xmin and height
# is ymax - ymin. Computed column-wise rather than with a row-by-row apply.
train_data['bbox_area'] = (
    (train_data['xmax'] - train_data['xmin'])
    * (train_data['ymax'] - train_data['ymin'])
)
# Plot the areas on a log2 scale.
sns.kdeplot(np.log2(train_data['bbox_area']));
sns.boxplot(y=np.log2(train_data['bbox_area']));
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardise the single `bbox_area` feature once; the scaled values are the
# same for every k, so there is no need to refit the scaler inside the loop.
scl = StandardScaler()
bbox_area_scaled = scl.fit_transform(train_data[['bbox_area']])

# Inertia for k = 2..7, plotted against k.
# A fixed random_state makes the curve and the labels reproducible between runs.
iner = []
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42).fit(bbox_area_scaled)
    iner.append(km.inertia_)
sns.lineplot(x=range(2, 8), y=iner);

# Final model with 3 clusters; store each row's cluster label.
km = KMeans(n_clusters=3, random_state=42).fit(bbox_area_scaled)
train_data['bbox_clus'] = km.labels_
def bbbox_annotate(img, bbox_df, new_size=False):
    """Draw the bounding boxes in ``bbox_df`` onto ``img``.

    Args:
        img: Image array indexed as (height, width, ...), as read by OpenCV.
        bbox_df: DataFrame with ``xmin``, ``ymin``, ``xmax`` and ``ymax``
            columns given in the coordinates of the *input* image.
        new_size: Falsy to draw on ``img`` as is. Otherwise an integer side
            length: the image is resized to ``new_size x new_size`` and the
            box coordinates are scaled to match.

    Returns:
        The annotated image. Without ``new_size`` the rectangles are drawn
        on the array that was passed in; with ``new_size`` they are drawn on
        the resized copy. If ``bbox_df`` has no rows the (possibly resized)
        image is returned without rectangles.
    """
    # Scale factors stay at 1 when no resize is requested, so both cases
    # share one drawing loop.
    x_scale = y_scale = 1.0
    if new_size:
        height, width = img.shape[0], img.shape[1]
        x_scale = new_size / width
        y_scale = new_size / height
        img = cv2.resize(img, (new_size, new_size))

    for _, row in bbox_df.iterrows():
        # Cast to built-in ints so OpenCV accepts the points whatever the
        # column dtype is.
        top_left = (int(np.round(row['xmin'] * x_scale)),
                    int(np.round(row['ymin'] * y_scale)))
        bottom_right = (int(np.round(row['xmax'] * x_scale)),
                        int(np.round(row['ymax'] * y_scale)))
        img = cv2.rectangle(img, top_left, bottom_right, (255, 0, 0), 5)
    # Returning `img` (not a loop-local) keeps the function valid when
    # bbox_df is empty.
    return img
def img_read(path, im, new_size=False):
    """Read an image file with OpenCV and return it in RGB channel order.

    Args:
        path: Directory containing the image.
        im: File name of the image inside ``path``.
        new_size: Falsy to keep the original size; otherwise an integer side
            length, and the image is resized to ``new_size x new_size``.

    Returns:
        The image array converted from BGR to RGB, optionally resized.

    Raises:
        OSError: If OpenCV cannot read the file (missing, unreadable or in
            an unsupported format).
    """
    img_path = os.path.join(path, im)
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with a clear message instead of inside cvtColor.
    if img is None:
        raise OSError(f"Could not read image: {img_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if new_size:
        img = cv2.resize(img, (new_size, new_size))
    return img
path = '/kaggle/working/ship_detection-huggingface/train/'
fig, axs = plt.subplots(3, 1, figsize=(8, 10))
# One sample per bbox-area cluster: the last row of each cluster, ordered by
# cluster label so that every title names the `bbox_clus` value it shows.
cluster_samples = (
    train_data.drop_duplicates('bbox_clus', keep='last')
    .sort_values('bbox_clus')
)
for i, im, ax in zip(cluster_samples['bbox_clus'], cluster_samples['id'], axs.flatten()):
    # All boxes of the sampled image, selected by column name rather than
    # by column position.
    boxes = train_data.loc[train_data['id'] == im, ['xmin', 'ymin', 'xmax', 'ymax']]
    ax.imshow(bbbox_annotate(img_read(path, im), boxes, 800))
    ax.axis('off')
    ax.set_title(f"Cluster {i}")
# Grid of example images, one per distinct `total_ship` value (first 10),
# each annotated with its own bounding boxes.
fig, axs = plt.subplots(5, 2, figsize=(4, 10))
for annot, ax in zip(train_data['total_ship'].unique()[:10], axs.flatten()):
    # First image id whose rows carry this `total_ship` value.
    img_id = train_data.loc[train_data['total_ship'] == annot, 'id'].iloc[0]
    # Filter on `img_id` (the image being shown) so the boxes belong to
    # this image, not to one left over from an earlier loop.
    boxes = train_data.loc[train_data['id'] == img_id, ['xmin', 'ymin', 'xmax', 'ymax']]
    ax.imshow(bbbox_annotate(img_read(path, img_id), boxes, 800))
    ax.axis('off')
    ax.set_title(f"{img_id}")